Using RNA rlog normalized counts, and CAGE rlog normalized:
1. In general RNA and CAGE agrees with each other. bam_RNA seems to lose resolution in lowly expressed region, where CAGE has more spread. CAGE has more sequencing depth, maybe better resolution for lowly expressed genes.
2. do 3D plotting of bam_RNA, HS72_RNA and bam_CAGE to decide where to draw cutoff for off genes. In the end decided that whoever makes it to consensus cluster by tpm filter is ON.

suppressMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'stringr' was built under R version 3.3.2
## Warning: package 'forcats' was built under R version 3.3.2
suppressMessages(library(magrittr))
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
options(tibble.width = Inf)
options(scipen=999)

source("~/Dropbox/coding/R/my_package/genomicsDL/R/get_TSS.r")
plot_range=c(-3, 18) 
off_cut_CAGE=4.8
off_cut_RNA=7.5

load RNA gene_id to CAGE consensus cluster conversion

BDGP6_coding_200bp_intersect_consensusCluster <-read.delim("0BDGP6_coding_TSS200bp_intersect_consensusCluster.txt", header=FALSE, stringsAsFactors=FALSE) 

load RNA rlog transformed counts, and take average

rlog <- read.delim("DE_results_rlog_RNA_all.txt", header=T, stringsAsFactors=FALSE) %>% mutate(bam_RNA=(SHS.1.1.HS0hr+SHS.2.1.HS0hr)/2, HS48_RNA=(SHS.1.2.HS48hr+SHS.2.2.HS48hr)/2, HS72_RNA=(SHS.1.3.HS72hr+SHS.2.3.HS72hr)/2, Sa_RNA=(SHS.1.8.Sa+SHS.2.8.Sa)/2, Aly_RNA=(SHS.1.9.Aly+SHS.2.9.Aly)/2 )

take CAGE rlog transformed counts, and take average

CAGE_rlog_BH <- read.delim("DE_normalized_counts_rlog_CAGE_BH_up500_5utr_CDS.txt", stringsAsFactors=FALSE) %>%  mutate(bam_CAGE = (B1 + B2) / 2, HS72_CAGE = (H1 + H2) / 2 ) %>% select(X,bam_CAGE, HS72_CAGE)

merge RNA and CAGE normalized counts

logt_c = inner_join(rlog, BDGP6_coding_200bp_intersect_consensusCluster, by=c("X"="V4")) %>% inner_join(CAGE_rlog_BH, by=c("V10"="X"))

names(logt_c)
##  [1] "X"              "SHS.1.1.HS0hr"  "SHS.1.2.HS48hr" "SHS.1.3.HS72hr"
##  [5] "SHS.1.8.Sa"     "SHS.1.9.Aly"    "SHS.2.1.HS0hr"  "SHS.2.2.HS48hr"
##  [9] "SHS.2.3.HS72hr" "SHS.2.8.Sa"     "SHS.2.9.Aly"    "bam_RNA"       
## [13] "HS48_RNA"       "HS72_RNA"       "Sa_RNA"         "Aly_RNA"       
## [17] "V1"             "V2"             "V3"             "V5"            
## [21] "V6"             "V7"             "V8"             "V9"            
## [25] "V10"            "V11"            "V12"            "V13"           
## [29] "bam_CAGE"       "HS72_CAGE"

plot bam vs HS72

ggplot(logt_c, aes(bam_RNA, HS72_RNA)) +
      xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(HS72_RNA)") +
  coord_fixed(ratio=1) +geom_vline(xintercept = off_cut_RNA, color='red') +
    geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)

ggplot(logt_c, aes(bam_CAGE, HS72_CAGE)) +
      xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(HS72_CAGE)") +
  coord_fixed(ratio=1) +
  #geom_vline(xintercept = off_cut_CAGE, color='red') +
  #geom_hline(yintercept = off_cut_CAGE, color='red') + 
  geom_abline(intercept = 2, slope=1, color='yellow') +
  geom_abline(intercept = 4, slope=1, color='black') +
    geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 2 rows containing non-finite values (stat_bin2d).

plot bam_CAGE vs bam_RNA

ggplot(logt_c, aes(bam_RNA, bam_CAGE)) +
    geom_point(alpha = 3/10, size = 0.3, color="grey42") +
    xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(bam_CAGE)") +
    coord_fixed(ratio=1) +
    ggtitle("all transcripts") + geom_abline(intercept = 0, slope = 1, color='red') +
    #geom_hline(yintercept = off_cut_CAGE, color='red') +
    theme(plot.title = element_text(hjust = 0.5))  + theme_light()
## Warning: Removed 2 rows containing missing values (geom_point).

ggplot(logt_c, aes(bam_RNA, bam_CAGE)) +
      xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(bam_CAGE)") +
  coord_fixed(ratio=1) +geom_abline(intercept = 0, slope = 1, color='red') +
    geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 2 rows containing non-finite values (stat_bin2d).

plot HS72_CAGE vs HS72_RNA

ggplot(logt_c, aes(HS72_RNA, HS72_CAGE)) +
    geom_point(alpha = 3/10, size = 0.3, color="grey42") +
    xlab("log2(HS72_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(HS72_CAGE)") +
    coord_fixed(ratio=1) +
    ggtitle("all transcripts") +geom_abline(intercept = 0, slope = 1, color='red') +   
  #geom_hline(yintercept = off_cut_CAGE, color='red') +
    theme(plot.title = element_text(hjust = 0.5))  + theme_light()
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(logt_c, aes(HS72_RNA, HS72_CAGE)) +
  xlab("log2(HS72_RNA)") + xlim(plot_range) + ylim(plot_range) +
    ylab("log2(HS72_CAGE)") +
    coord_fixed(ratio=1) +geom_abline(intercept = 0, slope = 1, color='red') +
    geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 1 rows containing non-finite values (stat_bin2d).

p=plot_ly(slice(logt_c,1:5000), x=~bam_RNA, y=~HS72_RNA, z=~bam_CAGE, opacity=0.5, marker=list(size=2), mode="markers", type="scatter3d") #%>% add_trace(x=c(-4,4), y=c(-4,4), z=c(-4,4), mode="lines")
p
p=plot_ly(slice(logt_c,1:5000), x=~bam_RNA, y=~HS72_RNA, z=~HS72_CAGE, opacity=0.5, marker=list(size=2), mode="markers", type="scatter3d") #%>% add_trace(x=c(-4,4), y=c(-4,4), z=c(-4,4), mode="lines")
p

number of off genes by both RNA and CAGE

bam_CAGE_off=filter(logt_c, bam_CAGE <= off_cut_CAGE) %>% select(X) %>% unique()
nrow(bam_CAGE_off)
## [1] 2622
nrow(unique(filter(logt_c, bam_CAGE <= off_cut_CAGE & bam_RNA <= off_cut_RNA)))
## [1] 2008

what is this group that bam_RNA is on and not changed much in HS72, but bam_CAGE is off, which has more genes comparing to 7.DEnCAGE

these are the alternative TSS usage ones!

nrow(unique(filter(logt_c, bam_CAGE < off_cut_CAGE & bam_RNA > 10)))
## [1] 727
nrow(unique(select(filter(logt_c, bam_CAGE < off_cut_CAGE & bam_RNA > 10),X)))
## [1] 529
x=filter(logt_c, bam_CAGE < 0 & bam_RNA > 10 & bam_RNA < 12 & HS72_RNA >10 & HS72_RNA < 12) 

x_bed=select(x,V7:V12)

#write.table(x_bed,"bam_CAGE_off_RNA_on.bed", sep="\t", quote=F, row.names = F, col.names = F)

what is this group that HS72_RNA is on but HS72_CAGE is off

very few genes. maybe just for some reason didn’t pull by cap in CAGE?

nrow(unique(filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA)))
## [1] 13
nrow(unique(select(filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA),X)))
## [1] 12
y=filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA) 

y_bed=select(y,V7:V12)

#write.table(y_bed,"HS72_CAGE_off_RNA_on.bed", sep="\t", quote=F, row.names = F, col.names = F)

what is the group that bam/HS72 RNA is off, CAGE is on?

I thought that they are below detection level for RNAseq, BUT they are the multimappers that I failed to filter out of CAGE Q255.chr.bam!!!!! Fixed in this version.

nrow(unique(select(filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE),X)))
## [1] 3
nrow(unique(select(filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE),X)))
## [1] 1
zb=filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE) %>% select(V7:V12)
zh=filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE)%>% select(V7:V12)

nrow(unique(inner_join(select(filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE),X), select(filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE),X))))
## Joining, by = "X"
## [1] 0
#write.table(zb, "bam_RNA_off_CAGE_on.bed", sep="\t", quote=F, row.names = F, col.names = F)

#write.table(zh, "HS72_RNA_off_CAGE_on.bed", sep="\t", quote=F, row.names = F, col.names = F)
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.7.1    magrittr_1.5    forcats_0.2.0   stringr_1.2.0  
##  [5] dplyr_0.8.0.1   purrr_0.3.1     readr_1.1.1     tidyr_0.7.2    
##  [9] tibble_2.0.1    ggplot2_2.2.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5    reshape2_1.4.2      haven_1.1.0        
##  [4] lattice_0.20-35     colorspace_1.3-2    htmltools_0.3.6    
##  [7] viridisLite_0.3.0   yaml_2.1.14         rlang_0.3.1        
## [10] pillar_1.3.1        foreign_0.8-69      glue_1.3.0         
## [13] RColorBrewer_1.1-2  modelr_0.1.1        readxl_1.0.0       
## [16] plyr_1.8.4          munsell_0.4.3       gtable_0.2.0       
## [19] cellranger_1.1.0    rvest_0.3.2         htmlwidgets_0.9    
## [22] psych_1.7.8         evaluate_0.10.1     labeling_0.3       
## [25] knitr_1.17          httpuv_1.3.5        crosstalk_1.0.0    
## [28] parallel_3.3.1      broom_0.4.2         Rcpp_1.0.0         
## [31] xtable_1.8-2        scales_0.5.0        backports_1.1.1    
## [34] jsonlite_1.5        mime_0.5            mnormt_1.5-5       
## [37] hms_0.3             digest_0.6.12       stringi_1.1.5      
## [40] shiny_1.0.5         grid_3.3.1          rprojroot_1.2      
## [43] cli_1.0.1           tools_3.3.1         lazyeval_0.2.1     
## [46] crayon_1.3.4        pkgconfig_2.0.2     data.table_1.10.4-3
## [49] xml2_1.1.1          lubridate_1.7.1     assertthat_0.2.0   
## [52] rmarkdown_1.6       httr_1.3.1          rstudioapi_0.7     
## [55] R6_2.2.2            nlme_3.1-131